# importing packages
import pandas_profiling as pp
# importing pandas
import pandas as pd
# importing plotly
import plotly.express as px
# Reading marketing_campaign csv file in a dataframe using pandas
df_marketing_campaign = pd.read_csv(r'marketing_campaign.csv')
# Creating a statistical profile report in html format using pandas_profiling
profile = pp.ProfileReport(df_marketing_campaign)
profile.to_file("pandas_profiling_marketing_campaign.html")
# Inspecting data by displaying the first few rows in the dataframe
df_marketing_campaign.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
# Descriptive statistics summary using describe function of the dataframe
df_marketing_campaign.describe()
| ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2240.000000 | 2240.000000 | 2216.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | ... | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.0 | 2240.0 | 2240.000000 |
| mean | 5592.159821 | 1968.805804 | 52247.251354 | 0.444196 | 0.506250 | 49.109375 | 303.935714 | 26.302232 | 166.950000 | 37.525446 | ... | 5.316518 | 0.072768 | 0.074554 | 0.072768 | 0.064286 | 0.013393 | 0.009375 | 3.0 | 11.0 | 0.149107 |
| std | 3246.662198 | 11.984069 | 25173.076661 | 0.538398 | 0.544538 | 28.962453 | 336.597393 | 39.773434 | 225.715373 | 54.628979 | ... | 2.426645 | 0.259813 | 0.262728 | 0.259813 | 0.245316 | 0.114976 | 0.096391 | 0.0 | 0.0 | 0.356274 |
| min | 0.000000 | 1893.000000 | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 25% | 2828.250000 | 1959.000000 | 35303.000000 | 0.000000 | 0.000000 | 24.000000 | 23.750000 | 1.000000 | 16.000000 | 3.000000 | ... | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 50% | 5458.500000 | 1970.000000 | 51381.500000 | 0.000000 | 0.000000 | 49.000000 | 173.500000 | 8.000000 | 67.000000 | 12.000000 | ... | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 75% | 8427.750000 | 1977.000000 | 68522.000000 | 1.000000 | 1.000000 | 74.000000 | 504.250000 | 33.000000 | 232.000000 | 50.000000 | ... | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| max | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | ... | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.0 | 11.0 | 1.000000 |
8 rows × 26 columns
# Learning about the missing values and data types in the dataframe
df_marketing_campaign.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Complain 2240 non-null int64 26 Z_CostContact 2240 non-null int64 27 Z_Revenue 2240 non-null int64 28 Response 2240 non-null int64 dtypes: float64(1), int64(25), object(3) memory usage: 507.6+ KB
# Learning about all the unique values in Education column
df_marketing_campaign['Education'].unique()
array(['Graduation', 'PhD', 'Master', 'Basic', '2n Cycle'], dtype=object)
# Learning about all the unique values in Marital_Status column
df_marketing_campaign['Marital_Status'].unique()
array(['Single', 'Together', 'Married', 'Divorced', 'Widow', 'Alone',
'Absurd', 'YOLO'], dtype=object)
# Changing the date from string format to date format
df_marketing_campaign["Date_Customer"] = pd.to_datetime(df_marketing_campaign["Dt_Customer"])
# Calculating a new column of purchase by adding the amount spent on wines, fruits, meat, fish, gold, sweet
df_marketing_campaign["Total_Purchase"] = df_marketing_campaign["MntWines"] + df_marketing_campaign["MntFruits"] + \
df_marketing_campaign["MntMeatProducts"] + df_marketing_campaign["MntFishProducts"] + df_marketing_campaign["MntGoldProds"] \
+ df_marketing_campaign["MntSweetProducts"]
# Checking all the columns of the modified dataframe
df_marketing_campaign.columns
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
'Date_Customer', 'Total_Purchase'],
dtype='object')
# Displaying the top rows of the modified dataframe
df_marketing_campaign.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Date_Customer | Total_Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 2012-09-04 | 1617 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 2014-03-08 | 27 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 2013-08-21 | 776 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 2014-02-10 | 53 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 2014-01-19 | 422 |
5 rows × 31 columns
# Count of users sorted by the year on which they where born
agewise_born_users = df_marketing_campaign.groupby(['Year_Birth'])['ID'].nunique()
agewise_born_users.head()
Year_Birth 1893 1 1899 1 1900 1 1940 1 1941 1 Name: ID, dtype: int64
# Scatterplot represents the response of customers w.r.t. their birth year and their education level
"""
https://plotly.com/python-api-reference/generated/plotly.express.scatter.html
plotly.express.scatter(data_frame=None, x=None, y=None, color=None, symbol=None, size=None, hover_name=None, hover_data=None,\
custom_data=None, text=None, facet_row=None, facet_col=None, facet_col_wrap=0, facet_row_spacing=None,\
facet_col_spacing=None, error_x=None, error_x_minus=None, error_y=None, error_y_minus=None,\
animation_frame=None, animation_group=None, category_orders=None, labels=None, orientation=None,\
color_discrete_sequence=None, color_discrete_map=None, color_continuous_scale=None, range_color=None,\
color_continuous_midpoint=None, symbol_sequence=None, symbol_map=None, opacity=None, size_max=None,\
marginal_x=None, marginal_y=None, trendline=None, trendline_options=None, trendline_color_override=None,\
trendline_scope='trace', log_x=False, log_y=False, range_x=None, range_y=None, render_mode='auto',\
title=None, template=None, width=None, height=None)
"""
fig = px.scatter(df_marketing_campaign, x="Year_Birth", y="Education", color='Response')
fig.show()
# Scatterplot represents the date w.r.t. the amount spent on wines, fruits, meat, fish, gold, sweet
"""
https://plotly.com/python-api-reference/generated/plotly.express.scatter.html
plotly.express.scatter(data_frame=None, x=None, y=None, color=None, symbol=None, size=None, hover_name=None, hover_data=None,\
custom_data=None, text=None, facet_row=None, facet_col=None, facet_col_wrap=0, facet_row_spacing=None,\
facet_col_spacing=None, error_x=None, error_x_minus=None, error_y=None, error_y_minus=None,\
animation_frame=None, animation_group=None, category_orders=None, labels=None, orientation=None,\
color_discrete_sequence=None, color_discrete_map=None, color_continuous_scale=None, range_color=None,\
color_continuous_midpoint=None, symbol_sequence=None, symbol_map=None, opacity=None, size_max=None,\
marginal_x=None, marginal_y=None, trendline=None, trendline_options=None, trendline_color_override=None,\
trendline_scope='trace', log_x=False, log_y=False, range_x=None, range_y=None, render_mode='auto',\
title=None, template=None, width=None, height=None)
"""
fig = px.scatter(df_marketing_campaign, x='Date_Customer', y=['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', \
'MntSweetProducts', 'MntGoldProds'])
fig.show()
# Scatterplot represents the date w.r.t. purchase amount of different items and their respective response
"""
https://plotly.com/python-api-reference/generated/plotly.express.scatter.html
plotly.express.scatter(data_frame=None, x=None, y=None, color=None, symbol=None, size=None, hover_name=None, hover_data=None,\
custom_data=None, text=None, facet_row=None, facet_col=None, facet_col_wrap=0, facet_row_spacing=None,\
facet_col_spacing=None, error_x=None, error_x_minus=None, error_y=None, error_y_minus=None,\
animation_frame=None, animation_group=None, category_orders=None, labels=None, orientation=None,\
color_discrete_sequence=None, color_discrete_map=None, color_continuous_scale=None, range_color=None,\
color_continuous_midpoint=None, symbol_sequence=None, symbol_map=None, opacity=None, size_max=None,\
marginal_x=None, marginal_y=None, trendline=None, trendline_options=None, trendline_color_override=None,\
trendline_scope='trace', log_x=False, log_y=False, range_x=None, range_y=None, render_mode='auto',\
title=None, template=None, width=None, height=None)
"""
fig = px.scatter(df_marketing_campaign, x='Date_Customer', y=['MntWines','MntMeatProducts', 'MntGoldProds'] , color='Response')
fig.show()
# Scatterplot represents the date w.r.t. purchase mode counts and their respective response, education level
"""
https://plotly.com/python-api-reference/generated/plotly.express.scatter.html
plotly.express.scatter(data_frame=None, x=None, y=None, color=None, symbol=None, size=None, hover_name=None, hover_data=None,\
custom_data=None, text=None, facet_row=None, facet_col=None, facet_col_wrap=0, facet_row_spacing=None,\
facet_col_spacing=None, error_x=None, error_x_minus=None, error_y=None, error_y_minus=None,\
animation_frame=None, animation_group=None, category_orders=None, labels=None, orientation=None,\
color_discrete_sequence=None, color_discrete_map=None, color_continuous_scale=None, range_color=None,\
color_continuous_midpoint=None, symbol_sequence=None, symbol_map=None, opacity=None, size_max=None,\
marginal_x=None, marginal_y=None, trendline=None, trendline_options=None, trendline_color_override=None,\
trendline_scope='trace', log_x=False, log_y=False, range_x=None, range_y=None, render_mode='auto',\
title=None, template=None, width=None, height=None)
"""
fig = px.scatter(df_marketing_campaign, x='Date_Customer', y=['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',\
'NumStorePurchases', 'NumWebVisitsMonth'], color='Response',\
facet_col= 'Education')
fig.show()
# Calculating the mean purchase for both positive and negative responses
df_avg_marketing_campaign = pd.DataFrame(df_marketing_campaign.groupby(['Response']).aggregate({'Total_Purchase':'mean'}))
df_avg_marketing_campaign
| Total_Purchase | |
|---|---|
| Response | |
| 0 | 538.929171 |
| 1 | 987.392216 |
# Calculating the count of users which responded positively to any of the 5 campaigns but responded negatively to the\
# current campaign
print(df_marketing_campaign.query('Response == 0 & AcceptedCmp1 == 1').aggregate({'AcceptedCmp1': 'count'}))
print("\n")
print(df_marketing_campaign.query('Response == 0 & AcceptedCmp2 == 1').aggregate({'AcceptedCmp2': 'count'}))
print("\n")
print(df_marketing_campaign.query('Response == 0 & AcceptedCmp3 == 1').aggregate({'AcceptedCmp3': 'count'}))
print("\n")
print(df_marketing_campaign.query('Response == 0 & AcceptedCmp4 == 1').aggregate({'AcceptedCmp4': 'count'}))
print("\n")
print(df_marketing_campaign.query('Response == 0 & AcceptedCmp5 == 1').aggregate({'AcceptedCmp5': 'count'}))
print("\n")
print("Accepted Either of the Campaign Order and still did not respond to the current campaign")
print(df_marketing_campaign.query('Response == 0 & (AcceptedCmp1 == 1 | AcceptedCmp2 == 1 | AcceptedCmp3 == 1 | \
AcceptedCmp4 == 1 | AcceptedCmp5 == 1)').aggregate({'Response': 'count'}))
AcceptedCmp1 65 dtype: int64 AcceptedCmp2 10 dtype: int64 AcceptedCmp3 86 dtype: int64 AcceptedCmp4 105 dtype: int64 AcceptedCmp5 71 dtype: int64 Accepted Either of the Campaign Order and still did not respond to the current campaign Response 275 dtype: int64